# importing necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, learning_curve
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import VotingClassifier, AdaBoostClassifier
from sklearn.metrics import classification_report, roc_auc_score, roc_curve
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import IsolationForest
from sklearn.pipeline import Pipeline
C:\Users\HP\anaconda3\Lib\site-packages\pandas\core\arrays\masked.py:60: UserWarning: Pandas requires version '1.3.6' or newer of 'bottleneck' (version '1.3.5' currently installed). from pandas.core import (
# Load data
# Read the wireless-customer churn dataset: one row per customer, with the
# binary target column 'Churn' (0/1 — presumably 1 = churned; TODO confirm
# the label encoding against the data source).
df = pd.read_csv("wireless_churn.csv")
# Preview the first five rows (rendered by the notebook).
df.head()
| AccountWeeks | ContractRenewal | DataPlan | DataUsage | CustServCalls | DayMins | DayCalls | MonthlyCharge | OverageFee | RoamMins | Churn | |
|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 128 | 1 | 1 | 2.7 | 1 | 265.1 | 110 | 89.0 | 9.87 | 10.0 | 0 |
| 1 | 107 | 1 | 1 | 3.7 | 1 | 161.6 | 123 | 82.0 | 9.78 | 13.7 | 0 |
| 2 | 137 | 1 | 0 | 0.0 | 0 | 243.4 | 114 | 52.0 | 6.06 | 12.2 | 0 |
| 3 | 84 | 0 | 0 | 0.0 | 2 | 299.4 | 71 | 57.0 | 3.10 | 6.6 | 0 |
| 4 | 75 | 0 | 0 | 0.0 | 3 | 166.7 | 113 | 41.0 | 7.42 | 10.1 | 0 |
# Create Profile Report
# Generate an automated EDA report (distributions, correlations, missing
# values, duplicates) over the raw dataframe with ydata-profiling.
# NOTE: removed the redundant `import ydata_profiling` — only ProfileReport
# is used, and it is imported explicitly below.
from ydata_profiling import ProfileReport

# Fixed typo in the report title ("Wiseless" -> "Wireless").
profile = ProfileReport(df, title="Wireless_churn Profile Report")
# Render the report twice: once as interactive widgets, once as an inline
# iframe (both views of the same report, as in the original cell).
profile.to_widgets()
profile.to_notebook_iframe()
C:\Users\HP\anaconda3\Lib\site-packages\numba\core\decorators.py:262: NumbaDeprecationWarning: numba.generated_jit is deprecated. Please see the documentation at: https://numba.readthedocs.io/en/stable/reference/deprecation.html#deprecation-of-generated-jit for more information and advice on a suitable replacement. warnings.warn(msg, NumbaDeprecationWarning) C:\Users\HP\anaconda3\Lib\site-packages\visions\backends\shared\nan_handling.py:50: NumbaDeprecationWarning: The 'nopython' keyword argument was not supplied to the 'numba.jit' decorator. The implicit default value for this argument is currently False, but it will be changed to True in Numba 0.59.0. See https://numba.readthedocs.io/en/stable/reference/deprecation.html#deprecation-of-object-mode-fall-back-behaviour-when-using-jit for details. @nb.jit
Summarize dataset: 0%| | 0/5 [00:00<?, ?it/s]
Generate report structure: 0%| | 0/1 [00:00<?, ?it/s]
Render widgets: 0%| | 0/1 [00:00<?, ?it/s]
VBox(children=(Tab(children=(Tab(children=(GridBox(children=(VBox(children=(GridspecLayout(children=(HTML(valu…
Render HTML: 0%| | 0/1 [00:00<?, ?it/s]
# Step 1: Remove Outliers
# Flag roughly 2% of rows as outliers with an IsolationForest fitted on the
# feature columns only (target excluded, so the detection is unsupervised
# with respect to churn). fit_predict returns +1 for inliers, -1 for outliers.
iso = IsolationForest(contamination=0.02, random_state=100)
outliers = iso.fit_predict(df.drop('Churn', axis=1))
df_clean = df[outliers == 1]

# Step 2: Prepare Data
X = df_clean.drop("Churn", axis=1)
y = df_clean["Churn"]
# The Churn target is imbalanced (~14% positives per the test-set reports),
# so stratify the split to preserve the class ratio in both train and test.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=100, stratify=y
)

# Scaling — fit the scaler on the training set only, then apply the same
# transform to the test set, so no test-set statistics leak into training.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)
# Step 3: Learning Curves
%matplotlib inline
def plot_learning_curve(estimator, X, y, title):
train_sizes, train_scores, test_scores = learning_curve(
estimator, X, y, cv=5, scoring='recall_weighted',
train_sizes=np.linspace(0.1, 1.0, 5), random_state=100
)
train_scores_mean = np.mean(train_scores, axis=1)
test_scores_mean = np.mean(test_scores, axis=1)
plt.figure()
plt.plot(train_sizes, train_scores_mean, 'o-', label="Training recall")
plt.plot(train_sizes, test_scores_mean, 'o-', label="Cross-validation recall")
plt.title(title)
plt.xlabel("Training Examples")
plt.ylabel("Recall")
plt.legend(loc="best")
plt.grid()
plt.show()
# Candidate models: a class-weighted logistic regression (class_weight='balanced'
# compensates for the imbalanced Churn target) and a Gaussian Naive Bayes baseline.
log_reg = LogisticRegression(solver='lbfgs', class_weight='balanced', max_iter=1000, random_state=100)
nb = GaussianNB()
# Draw learning curves on the scaled training data to check for over-/under-fitting.
plot_learning_curve(log_reg, X_train_scaled, y_train, "Learning Curve - Logistic Regression")
plot_learning_curve(nb, X_train_scaled, y_train, "Learning Curve - Naive Bayes")
# Step 4: Optimized Models
# Fit both models on the scaled training set, then evaluate on the held-out test set.
log_reg.fit(X_train_scaled, y_train)
nb.fit(X_train_scaled, y_train)
y_pred_log = log_reg.predict(X_test_scaled)
y_pred_nb = nb.predict(X_test_scaled)
# Per-class precision/recall/F1, plus ROC AUC computed from the
# positive-class predicted probabilities (column 1 of predict_proba).
print("Logistic Regression Report:")
print(classification_report(y_test, y_pred_log))
print("ROC AUC:", roc_auc_score(y_test, log_reg.predict_proba(X_test_scaled)[:,1]))
print("\nNaive Bayes Report:")
print(classification_report(y_test, y_pred_nb))
print("ROC AUC:", roc_auc_score(y_test, nb.predict_proba(X_test_scaled)[:,1]))
Logistic Regression Report:
precision recall f1-score support
0 0.95 0.78 0.86 562
1 0.36 0.76 0.49 92
accuracy 0.78 654
macro avg 0.66 0.77 0.67 654
weighted avg 0.87 0.78 0.81 654
ROC AUC: 0.8372466346897726
Naive Bayes Report:
precision recall f1-score support
0 0.91 0.95 0.93 562
1 0.59 0.45 0.51 92
accuracy 0.88 654
macro avg 0.75 0.70 0.72 654
weighted avg 0.87 0.88 0.87 654
ROC AUC: 0.8573804734643354
# Step 5: Voting Ensemble
# Soft-voting ensemble: averages the predicted class probabilities of the
# (already configured) logistic regression and a fresh AdaBoost classifier.
# Note: the previously fitted log_reg is refit here as part of the ensemble.
ada = AdaBoostClassifier(n_estimators=100, random_state=100)
ensemble = VotingClassifier(estimators=[
('lr', log_reg), ('ada', ada)
], voting='soft')
ensemble.fit(X_train_scaled, y_train)
y_pred_ens = ensemble.predict(X_test_scaled)
# Evaluate the ensemble the same way as the individual models.
print("\nVoting Ensemble Report:")
print(classification_report(y_test, y_pred_ens))
print("ROC AUC:", roc_auc_score(y_test, ensemble.predict_proba(X_test_scaled)[:,1]))
Voting Ensemble Report:
precision recall f1-score support
0 0.95 0.84 0.89 562
1 0.42 0.71 0.53 92
accuracy 0.82 654
macro avg 0.68 0.77 0.71 654
weighted avg 0.87 0.82 0.84 654
ROC AUC: 0.869507194801176